Explore Endowment Values Over Time

# starting by only exploring the CY* variables
endowment_data <- read_rds(here("data", 
                                "endowments_by_most_recent_filings.RDS")) 

companies_to_ein <- read_csv(here("data", "companies.csv")) %>%
  mutate(EIN = as.character(ein)) %>%
  select(EIN, organization_name)
endowment_data <- read_rds(here("data", 
                                "endowment_filter_data_990.RDS")) %>%
  select(EIN, fiscal_year, contains("CY"))


# only include EINs that have at least one observation of 
# one of the endowment variables
include_eins <- endowment_data %>%
  pivot_longer(-c(EIN,fiscal_year)) %>%
  group_by(EIN) %>%
  summarize(na_count = sum(is.na(value)),
             total_rows = n()) %>%
  filter(na_count < total_rows) %>%
  pull(EIN) %>% unique()


# taking most recently available information from the CY* variables
# this is to take into account findings from cross referencing that 
# values are not always in correspondence
# when they are in correspondence, this will be equivalent to just taking
# the CY data from each fiscal year 
endowment_data <- endowment_data %>%
  filter(EIN %in% include_eins) %>% 
    group_by(EIN) %>%
    pivot_longer(3: ncol(.),
                 names_to = "variable_name") %>%
    mutate(source = ifelse(grepl("CYM", variable_name),
                           substr(variable_name, 1,4), "CY"),
           # get the year lag from the CYM variable name
           # CYM1 corresponds to 1 year lag
           year_lag = ifelse(grepl("CYM", variable_name), 
                             substr(source, 4,4), 0),
           year_lag = as.numeric(year_lag),
           fiscal_year = as.numeric(paste0(fiscal_year)),
           # get rid of CY or CMX part of the variable name (where X is a year)
           variable_name = gsub("CY|CYM.", "", variable_name)) %>%
    mutate(value_year = fiscal_year -year_lag
           ) %>%
  group_by(EIN, value_year, variable_name) %>%
  arrange(EIN, variable_name, fiscal_year) %>%
  # pick the most recent one available
  slice_max(n = 1, order_by = fiscal_year)  %>%
  select(EIN, value_year, variable_name, source, value) %>%
  rename(fiscal_year=value_year) %>%
  ungroup() %>%
  # add company names
  left_join(companies_to_ein)
endowment_data <- read_rds(here("data", 
                                "endowments_by_most_recent_filings.RDS")) %>%
  select(-c(EndowmentsHeldUnrelatedOrgInd, EndowmentsHeldRelatedOrgInd)) %>%
  pivot_longer(-c(EIN, fiscal_year),
               names_to = "variable_name") %>%
  left_join(companies_to_ein)
# this function splits the data into 4 groups based on the mean value for that EIN 
# and plots the variable over time, faceted by group
# this is to handle the fact that the values are on very different scales,
# so if we plot them all together, it masks any interesting trends
# dotted lines represent where there was a missing year between observations
plot_variable <- function(var) {
  
  # get eins with at least one observation of the variable
  eins_with_variable <- endowment_data %>%
  filter(variable_name == var) %>%
    group_by(EIN) %>%
    summarize(number_observations = sum(!is.na(value))) %>%
    filter(number_observations != 0) %>%
    pull(EIN)
  
  # split into quantiles based on EIN mean
  data <- endowment_data %>%
      filter(EIN %in% eins_with_variable & variable_name == var) %>%
      group_by(EIN) %>%
      mutate(EIN_mean = mean(value,na.rm= TRUE)) %>%
      ungroup() %>%
      group_by(variable_name) %>%
      mutate(quantile_group = ntile(EIN_mean, n = 4)) %>%
      group_by(EIN) %>% 
      # make sure EIN has single quantile group
      mutate(quantile_group = max(quantile_group)) %>%
      mutate(quantile_group_labels = factor(paste0("Quantile ", quantile_group))) %>%
      mutate(organization_name = ifelse(is.na(organization_name),
                                        "Not Available", 
                                        organization_name)) %>%
    ungroup()
  
  # data with no nas so we can connect the values wehre there was a missing year
  # instead of just having no line connecting points from years on each side of the gap
  data_no_nas <- data %>% filter(!is.na(value))
  
  data %>%
    ggplot(aes(x = fiscal_year, y = value, color = EIN, label = organization_name)) +
    geom_line(data = data_no_nas,
              aes( 
                  x = fiscal_year,
                  y = value, group = EIN),
              color = "darkgray",
              linetype = "dotted") +
    geom_point(size = .9) +
    geom_line() +
    facet_wrap(~fct_reorder(
      quantile_group_labels,
      .x = quantile_group), scales="free_y", ncol = 2) +
    scale_y_continuous(labels = comma) +
    scale_x_continuous(breaks = seq.int(2010, 2021, by =2)) +
    viridis::scale_color_viridis(option = "mako", discrete = TRUE, end=.94) +
    theme_bw() +
    labs(title = paste0("Change in ", var, " Over Time"),
         x = "Fiscal Year",
         y = paste0("value of ", var)) +
    theme(plot.title = element_text(size = 18, 
                                    hjust = .5, face="bold",
                                    margin =margin(.1,.1,10,.1)),
          plot.subtitle = element_text(hjust = .5, face="italic"),
          axis.text.x = element_text(size = 13),
          axis.title = element_text(size = 13, face = "bold"),
          legend.position = "none")
}

Static Plots

# plot each variable over time
walk(unique(endowment_data$variable_name),
     ~{plt <- plot_variable(.x) 
     print(plt)
     }
)

Interactive Plots

# margins
m <- list(
    l = 50,
    r = 50,
    b = 100,
    t = 150,
    pad = 0.5
)


# https://github.com/plotly/plotly.R/issues/570
plotlist <- map(unique(endowment_data$variable_name),
     ~{plt <- plot_variable(.x) 
      plt <- ggplotly(plt) %>% layout(height = 450,
                                      margin =m)

     }
)

htmltools::tagList(setNames(plotlist, NULL))







Plot Variables Against Each Other

# function to plot variables of interest against each other
plot_combo <- function(var1, var2, data) {
  
  var2 <- sym(var2)
  var1_range <- data %>% 
    pull(var1) %>% 
    range(na.rm=TRUE)
  var1_range <- var1_range[2]- var1_range[1]
  
  var2_range <-  data %>%
    pull(var2) %>%
    range(na.rm=TRUE)
  var2_range <- var2_range[2]- var2_range[1]
  
  quantile_var <- ifelse(var1_range > var2_range, var1, var2 )

  data %>%
    group_by(EIN) %>%
    mutate(maxval = max(!!sym(quantile_var),na.rm=TRUE)) %>%
    ungroup() %>%
    mutate( tile = ntile(maxval, 4)) %>%
    group_by(EIN) %>%
    mutate(tile = sample(tile, 1)) %>%
    filter(!is.na(tile)) %>%
    ggplot(aes(x = !!sym(var1), y = !!sym(var2), color = EIN)) +
    geom_point(alpha = .9) +
   # geom_line(alpha = .5) +
    facet_wrap(~tile, scales="free") +
    viridis::scale_color_viridis(discrete=TRUE,
                                 option = "rocket",
                                 end = .9) +
    theme_bw()+
      theme(plot.title = element_text(size = 18, 
                                      hjust = .5, face="bold",),
            plot.subtitle = element_text(hjust = .5, 
                                         face="italic",
                                         size = 16),
            axis.title = element_text(size = 13, 
                                      face = "bold"),
            legend.position = "none") +
    scale_x_continuous(labels=comma) +
    scale_y_continuous(labels=comma) +
    labs(title = paste0(var2, " versus ", var1),
         subtitle = "Fill by EIN")
  
  
}
variable_combinations <- t(combn(unique(endowment_data$variable_name), 2)) %>%
  as.data.frame()

pwalk(variable_combinations, ~{ 
 plt <- plot_combo(var1 = .x, var2 = .y, data = endowment_data_wide)
 print(plt)
 }
)

Plots by Rank

# function to plot variables of interest against each other
plot_ranks <- function(var1, var2, data) {

  
   plt <- data %>%
    group_by(fiscal_year) %>%
   # arrange(var1) %>%
    mutate("{var1}_rank" := rank(!!sym(var1))) %>%
#    arrange(var2) %>%
    mutate("{var2}_rank"  := rank(!!sym(var2))) %>%
    ggplot(aes(x = !!sym(glue("{var1}_rank" )), y =!!sym(glue("{var2}_rank" )),
               color  = organization_name,
               label =EIN
               )) +
    geom_point() +
    labs(x = paste0(var1, " Rank"),
         y =  paste0(var2, " Rank")) +
    theme_bw() +
    labs(title = glue("Rank of {var2}\nversus Rank of {var1}")) +
    viridis::scale_color_viridis(discrete=TRUE,
                                 option = "mako",
                                 end = .9) +
     facet_wrap(~fiscal_year)+
      theme(plot.title = element_text(size = 18, 
                                      hjust = .5, face="bold",),
            plot.subtitle = element_text(hjust = .5, 
                                         face="italic",
                                         size = 16),
            axis.title = element_text(size = 13, 
                                      face = "bold"))
  
  ggplotly(plt)

}

# function to plot variables of interest against each other
plot_combo <- function(var1, var2, data) {
  
  data %>%
    ggplot(aes(x = !!sym(var1), y = !!sym(var2), color = EIN)) +
    geom_point(alpha = .9) +
   # geom_line(alpha = .5) +
    facet_wrap(~fiscal_year) +
    viridis::scale_color_viridis(discrete=TRUE,
                                 option = "rocket",
                                 end = .9) +
    theme_bw()+
      theme(plot.title = element_text(size = 18, 
                                      hjust = .5, face="bold",),
            plot.subtitle = element_text(hjust = .5, 
                                         face="italic",
                                         size = 16),
            axis.title = element_text(size = 13, 
                                      face = "bold"),
            legend.position = "none") +
    scale_x_continuous(labels=comma) +
    scale_y_continuous(labels=comma) +
    labs(title = paste0(var2, " versus ", var1),
         subtitle = "Fill by EIN")
  
}


endowment_data_wide <- endowment_data %>% 
  pivot_wider(names_from=variable_name,
              values_from=value) 

Plotting Endowment Variables Against Each Other, By Year

# pairwise combinations of variables
variable_combinations <- t(combn(unique(endowment_data$variable_name), 2)) %>%
  as.data.frame()
pwalk(variable_combinations, ~{ 
 plt <- plot_combo(var1 = .x, var2 = .y, data = endowment_data_wide)
 print(plt)
 }
)
plotlist <- pmap(variable_combinations, ~{ 
 plt <- plot_ranks(var1 = .x, var2 = .y, data = endowment_data_wide)
 print(plt)
 
 }
)


htmltools::tagList(setNames(plotlist, NULL))

Compensation

source(here("GET_VARS.R"))

files <- dir(here("ballet_990_released_20230208"),
              full.names = TRUE)

comp <- map_df(files, ~get_df(filename = .x, schedule = "j"))


comp_clean <- comp %>%
  rename_with(.cols= everything(),
              ~gsub('/Return/ReturnData/IRS990ScheduleJ/', '', .)) %>% 
  select(-contains("Ind")) %>%
  select(fiscal_year, EIN,
         contains("RltdOrgOfficerTrstKeyEmplGrp")) %>%
  # only extract cols within the RltdOrgOfficerTrstKeyEmplGrp
  select(EIN, fiscal_year,
         matches("RltdOrgOfficerTrstKeyEmplGrp\\[.*.\\]/")) %>%
  pivot_longer(-c(EIN,fiscal_year)) %>%
  mutate(id = gsub("\\D", "", name),
       #  name_old = name,
          name = gsub(".*./", "", name),
         id = gsub("990", "", id))

  
comp_clean <- comp_clean %>%
  filter(!is.na(value)) %>%
  distinct() %>% 
  pivot_wider(names_from = name, values_from = value) 



comp_clean <- comp_clean %>%
  mutate(across(contains("Amt"), as.numeric))%>%
  mutate(TitleTxt=tolower(TitleTxt))
  
saveRDS(comp_clean, here("data", "schedj.RDS"))
comp_clean <- read_rds(here("data", "schedj.RDS"))%>%
  left_join(companies_to_ein)
# number of EINs with each type of title
comp_clean %>%
  group_by(TitleTxt) %>%
  summarize(`Number of EINs` = n_distinct(EIN)) %>% 
  arrange(desc(`Number of EINs`))
comp_clean %>%
  mutate(TitleTxt=tolower(TitleTxt)) %>%
  filter(!is.na(TitleTxt)) %>%
  group_by(TitleTxt) %>%
  summarize(`Number of Individuals in Position` = n()) %>% 
  arrange(desc(`Number of Individuals in Position`))
# missingness by variable
comp_clean %>%
  select(-c(EIN,fiscal_year,id)) %>%
  is.na() %>% 
  colSums() %>%
  as_tibble(rownames="Variable") %>%
  mutate(`Not Missing` = nrow(comp_clean) - value) %>%
  select(-value)
comp_clean %>% 
  group_by(TitleTxt) %>%
  mutate(m = median(BaseCompensationFilingOrgAmt, na.rm= TRUE)) %>%
  filter(!is.na(m)) %>%
  ungroup() %>%
  ggplot(aes(x=fct_reorder(TitleTxt,m),
             y = BaseCompensationFilingOrgAmt)) +
  geom_jitter() +
  coord_flip() +
  theme_bw() +
  labs(title = "Compensation by Title",
       x = "Title")+
  theme(plot.title = element_text(size = 18, 
                                      hjust = .5, face="bold",),
            plot.subtitle = element_text(hjust = .5, 
                                         face="italic",
                                         size = 16),
            axis.title = element_text(size = 13, 
                                      face = "bold"))

plt <- comp_clean %>%
  group_by(EIN, fiscal_year) %>%
  summarize(total_compensation = sum(BaseCompensationFilingOrgAmt)) %>%
  group_by(EIN) %>%
  mutate(m = median(total_compensation, na.rm= TRUE)) %>%
  ungroup() %>%
#  group_by(EIN) %>%
  mutate(tile = ntile(m,2)) %>%
  ggplot(aes(x=fiscal_year,
             y = total_compensation,
            color = EIN,
            group = EIN)) +
  geom_line() +
  geom_point() +
  labs(title = "Compensation to Highest Paid Employees",
       subtitle = "By EIN")+
    viridis::scale_color_viridis(discrete=TRUE,
                                 option = "rocket",
                                 end = .9) +
    theme_bw()+
    theme(plot.title = element_text(size = 18, 
                                      hjust = .5, face="bold",),
            plot.subtitle = element_text(hjust = .5, 
                                         face="italic",
                                         size = 16),
            axis.title = element_text(size = 13, 
                                      face = "bold")) +
  facet_wrap(~tile, scales = "free_y") +
  scale_y_continuous(labels = comma)

ggplotly(plt) %>%
  layout(height = 450, width = 600)
# plot compensation versus beginning year balance by fiscal year
comp <- comp_clean %>% 
  mutate(fiscal_year = as.numeric(paste(fiscal_year))) %>%
  left_join(endowment_data_wide)  %>%
  group_by(EIN, fiscal_year, BeginningYearBalanceAmt, organization_name) %>%
  summarize(total_compensation = sum(BaseCompensationFilingOrgAmt)) 
  
  
plt <- comp %>%
  ggplot(aes(x=BeginningYearBalanceAmt,
             y = total_compensation,
            color = EIN)) +
  geom_point() +
  facet_wrap(~fiscal_year, nrow = 2)+
    theme_bw()+
    theme(plot.title = element_text(size = 18, 
                                      hjust = .5, face="bold",),
            plot.subtitle = element_text(hjust = .5, 
                                         face="italic",
                                         size = 16),
            axis.title = element_text(size = 13, 
                                      face = "bold"))+
    viridis::scale_color_viridis(discrete=TRUE,
                                 option = "magma",
                                 end = .9)

ggplotly(plt) %>% layout(height = 450,
                         width = 600)
plot_ranks("BeginningYearBalanceAmt",
           "total_compensation", data = comp )